Bigcontest_EDA
0. Library Packages
library(dplyr)
options(dplyr.summarise.inform = FALSE)
library(tidyr)
library(lubridate)
library(ggplot2)
library(lubridate)
library(plotly)1. Data load
# Read the three raw CSV tables from the shared data directory.
loan_result <- read.csv(file = "../../../data/loan_result.csv")
log_data <- read.csv(file = "../../../data/log_data.csv")
user_spec <- read.csv(file = "../../../data/user_spec.csv")
데이터 살펴보기
head(user_spec)## application_id user_id birth_year gender insert_time credit_score
## 1 1249046 118218 1985 1 2022-06-07 06:28:18 660
## 2 954900 553686 1968 1 2022-06-07 14:29:03 870
## 3 137274 59516 1997 1 2022-06-07 21:40:22 710
## 4 1570936 167320 1989 1 2022-06-07 09:40:27 820
## 5 967833 33400 2000 1 2022-06-07 08:55:07 630
## 6 1559350 746993 1994 1 2022-06-07 09:55:03 600
## yearly_income income_type company_enter_month employment_type
## 1 1.08e+08 PRIVATEBUSINESS 20151101 기타
## 2 3.00e+07 PRIVATEBUSINESS 20070201 정규직
## 3 3.00e+07 FREELANCER 20210901 기타
## 4 6.20e+07 EARNEDINCOME 20170101 정규직
## 5 3.60e+07 EARNEDINCOME 20210901 정규직
## 6 3.50e+07 FREELANCER 20160401 기타
## houseown_type desired_amount purpose personal_rehabilitation_yn
## 1 자가 1e+06 기타 0
## 2 기타가족소유 3e+07 대환대출 0
## 3 기타가족소유 1e+07 생활비 0
## 4 자가 2e+06 생활비 0
## 5 기타가족소유 5e+06 생활비 0
## 6 기타가족소유 5e+06 생활비 0
## personal_rehabilitation_complete_yn existing_loan_cnt existing_loan_amt
## 1 NA 4 1.62e+08
## 2 NA 1 2.70e+07
## 3 NA 5 1.50e+07
## 4 NA 7 3.44e+08
## 5 0 1 1.60e+07
## 6 NA 1 NA
데이터 결측치 확인
colSums(is.na(user_spec))## application_id user_id
## 0 0
## birth_year gender
## 12961 12961
## insert_time credit_score
## 0 105115
## yearly_income income_type
## 90 0
## company_enter_month employment_type
## 171760 0
## houseown_type desired_amount
## 0 85
## purpose personal_rehabilitation_yn
## 0 587461
## personal_rehabilitation_complete_yn existing_loan_cnt
## 1203354 198556
## existing_loan_amt
## 313774
2. User_spec 데이터 전처리
2.1 birth_year & gender 결측치 처리
# Collapse the profile to one row per user_id: birth_year and gender are
# averaged over all of that user's rows, ignoring missing entries.
# A user whose values are missing in every row gets NaN, which is.na()
# reports as missing in the check that follows.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
user_info <- user_spec %>%
  group_by(user_id) %>%
  summarise(
    birth_year = mean(birth_year, na.rm = TRUE),
    gender = mean(gender, na.rm = TRUE)
  ) %>%
  ungroup()
# 모든 데이터에서 NA값인 user_id 개수 확인
colSums(is.na(user_info))## user_id birth_year gender
## 0 6856 6856
# Drop the users that have no usable birth_year (6856 users), then swap the
# per-row birth_year/gender columns for the per-user values from user_info.
na_list <- user_info$user_id[is.na(user_info$birth_year)]
user_spec2 <- user_spec %>%
  filter(!(user_id %in% na_list)) %>%
  select(-any_of(c("birth_year", "gender"))) %>%
  left_join(user_info, by = "user_id")
colSums(is.na(user_spec2))## application_id user_id
## 0 0
## insert_time credit_score
## 0 103481
## yearly_income income_type
## 89 0
## company_enter_month employment_type
## 169287 0
## houseown_type desired_amount
## 0 84
## purpose personal_rehabilitation_yn
## 0 583589
## personal_rehabilitation_complete_yn existing_loan_cnt
## 1195440 193850
## existing_loan_amt birth_year
## 308314 0
## gender
## 0
2.2 income_type
income_type이 공백인 값들에 “BLANK” 입력
# Label empty income_type strings explicitly as "BLANK".
user_spec2$income_type <- replace(
  user_spec2$income_type,
  user_spec2$income_type == "",
  "BLANK"
)
unique(user_spec2$income_type)## [1] "PRIVATEBUSINESS" "FREELANCER" "EARNEDINCOME" "OTHERINCOME"
## [5] "EARNEDINCOME2" "PRACTITIONER" "BLANK"
2.3 yearly_income
yearly_income이 NA인 행 제거 (위 결측치 집계 기준 89행)
# Keep only the rows whose yearly_income is present.
user_spec3 <- user_spec2[complete.cases(user_spec2$yearly_income), ]
2.4 company_enter_month
우선 company_enter_month가 Na인 경우 0 대입, 이후 입력 형태가 YYYYMM 가 아닌 값들 수정
# A missing company_enter_month becomes the placeholder value 0.
user_spec3$company_enter_month <- replace(
  user_spec3$company_enter_month,
  is.na(user_spec3$company_enter_month),
  0
)
# Values above 1000000 have more digits than YYYYMM (the sample rows show
# 8-digit values such as 20151101); integer division by 100 drops the last two.
user_spec3$company_enter_month <- ifelse(
  user_spec3$company_enter_month > 1000000,
  user_spec3$company_enter_month %/% 100,
  user_spec3$company_enter_month
)
2.5 purpose
영어로 입력된 값을 한글로 병합
unique(user_spec3$purpose)## [1] "기타" "대환대출" "생활비" "사업자금" "주택구입"
## [6] "전월세보증금" "투자" "LIVING" "SWITCHLOAN" "ETC"
## [11] "INVEST" "자동차구입" "BUSINESS" "BUYCAR" "HOUSEDEPOSIT"
## [16] "BUYHOUSE"
# Fold the English purpose codes into their Korean equivalents.
# Values that match none of the codes are left as they are.
user_spec3$purpose <- recode(
  user_spec3$purpose,
  LIVING = "생활비",
  SWITCHLOAN = "대환대출",
  ETC = "기타",
  INVEST = "투자",
  BUSINESS = "사업자금",
  BUYCAR = "자동차구입",
  HOUSEDEPOSIT = "전월세보증금",
  BUYHOUSE = "주택구입"
)
2.6 desired_amount
Na 값을 가진 74 행 제거
# Keep only the rows whose desired_amount is present.
user_spec4 <- user_spec3[complete.cases(user_spec3$desired_amount), ]
2.7 existing_loan_cnt
기존 대출 횟수가 Na인 값은 0으로 수정
# A missing existing-loan count is treated as zero loans.
user_spec4$existing_loan_cnt <- replace(
  user_spec4$existing_loan_cnt,
  is.na(user_spec4$existing_loan_cnt),
  0
)
2.8 num to factor
범주형 변수(문자형 income_type, employment_type, houseown_type, purpose와 수치형 gender)를 factor로 변환
# Convert every categorical column to a factor in a single pass.
factor_cols <- c(
  "income_type", "employment_type", "houseown_type", "purpose", "gender"
)
user_spec4[factor_cols] <- lapply(user_spec4[factor_cols], as.factor)
colSums(is.na(user_spec4))
## application_id user_id
## 0 0
## insert_time credit_score
## 0 103477
## yearly_income income_type
## 0 0
## company_enter_month employment_type
## 0 0
## houseown_type desired_amount
## 0 0
## purpose personal_rehabilitation_yn
## 0 583505
## personal_rehabilitation_complete_yn existing_loan_cnt
## 1195356 0
## existing_loan_amt birth_year
## 308292 0
## gender
## 0
3. User_spec 데이터 분석
3.1 birth_year
# Overall distribution: number of rows per birth year, with an x-axis tick
# every 5 years (the plot title calls this the age distribution).
(
  user_spec4 %>%
    count(birth_year, name = "num") %>%
    ggplot(aes(x = birth_year, y = num)) +
    geom_line() +
    theme_minimal() +
    scale_x_continuous(
      breaks = seq(
        from = min(user_spec4$birth_year),
        to = max(user_spec4$birth_year),
        by = 5
      )
    ) +
    ggtitle("전체 나이 분포")
) %>%
  ggplotly()
# Birth-year distribution by gender
# Interactive line chart of row counts per birth year, one line per level of
# `group_var`. Replaces five copy-pasted pipelines that differed only in the
# grouping column, legend name and title.
#   data         data frame holding birth_year and the grouping column
#   group_var    unquoted grouping column, forwarded with {{ }}
#   legend_name  text shown above the colour legend
#   title        plot title
# Returns the plotly widget, so a top-level call displays the chart.
plot_birth_year_by <- function(data, group_var, legend_name, title) {
  counts <- data %>%
    group_by(birth_year, {{ group_var }}) %>%
    summarise(num = n()) %>%
    ungroup()
  p <- ggplot(
    counts,
    mapping = aes(
      x = birth_year, y = num,
      group = {{ group_var }}, colour = {{ group_var }}
    )
  ) +
    geom_line() +
    theme_minimal() +
    scale_color_discrete(name = legend_name) +
    labs(title = title)
  ggplotly(p)
}

# By gender
plot_birth_year_by(user_spec4, gender, "성별", "성별 나이 분포")
# By income type
plot_birth_year_by(
  user_spec4, income_type, "수입 종류별", "수입 종류별 나이 분포"
)
# By employment type
plot_birth_year_by(
  user_spec4, employment_type, "고용 형태별", "고용 형태별 나이 분포"
)
# By home-ownership type
plot_birth_year_by(
  user_spec4, houseown_type, "집 종류별", "집 종류별 나이 분포"
)
# By loan purpose
plot_birth_year_by(user_spec4, purpose, "목적별", "목적별 나이 분포")
unique(user_spec4$gender)
## [1] 1 0
## Levels: 0 1
3.2 company_enter_month
# Row counts per entry year (company_enter_month with its last two digits
# dropped); only years with more than 100 rows are listed.
user_spec4 %>%
  group_by(year = company_enter_month %/% 100) %>%
  summarise(num = n()) %>%
  filter(num > 100)
## # A tibble: 38 × 2
## year num
## <dbl> <int>
## 1 0 169199
## 2 1986 306
## 3 1987 380
## 4 1988 426
## 5 1989 557
## 6 1990 912
## 7 1991 991
## 8 1992 1281
## 9 1993 1289
## 10 1994 1356
## # … with 28 more rows
# Overall distribution of company entry months. Rows with the placeholder
# value 0 are skipped and the x-axis is limited to 1986-01 .. 2022-11.
(
  user_spec4 %>%
    filter(company_enter_month > 0) %>%
    count(company_enter_month, name = "num") %>%
    ggplot(aes(x = ym(company_enter_month), y = num)) +
    geom_line() +
    xlim(ym(198601), ym(202211)) +
    theme_minimal() +
    labs(title = "전체 회사 입사일 분포", x = "date")
) %>%
  ggplotly()
# Company entry-month distribution by gender
# Interactive line chart of row counts per company entry month, one line per
# level of `group_var`. Replaces five copy-pasted pipelines.
#   data         data frame holding company_enter_month and the grouping column
#   group_var    unquoted grouping column, forwarded with {{ }}
#   legend_name  text shown above the colour legend
#   title        plot title
#   clip_dates   when TRUE, limit the x-axis to 1986-01 .. 2022-11
# Rows with company_enter_month == 0 (the placeholder for a missing value)
# are excluded. Returns the plotly widget.
plot_enter_month_by <- function(data, group_var, legend_name, title,
                                clip_dates = FALSE) {
  counts <- data %>%
    filter(company_enter_month > 0) %>%
    group_by(company_enter_month, {{ group_var }}) %>%
    summarise(num = n()) %>%
    ungroup()
  p <- ggplot(
    counts,
    mapping = aes(
      x = ym(company_enter_month), y = num,
      group = {{ group_var }}, colour = {{ group_var }}
    )
  ) +
    geom_line()
  if (clip_dates) {
    p <- p + xlim(ym(198601), ym(202211))
  }
  p <- p +
    theme_minimal() +
    scale_color_discrete(name = legend_name) +
    xlab("date") +
    labs(title = title)
  ggplotly(p)
}

# NOTE(review): only the gender and income-type charts limited the x-axis in
# the original code; that difference is kept here. Confirm whether the other
# three charts should be clipped as well.
# By gender
plot_enter_month_by(
  user_spec4, gender, "성별", "성별 회사 입사일 분포",
  clip_dates = TRUE
)
# By income type
plot_enter_month_by(
  user_spec4, income_type, "수입 종류", "수입 종류별 회사 입사일 분포",
  clip_dates = TRUE
)
# By employment type
plot_enter_month_by(
  user_spec4, employment_type, "고용 형태별", "고용 형태별 회사 입사일 분포"
)
# By home-ownership type
plot_enter_month_by(
  user_spec4, houseown_type, "집 종류별", "집 종류별 회사 입사일 분포"
)
# By loan purpose
plot_enter_month_by(
  user_spec4, purpose, "목적별", "목적별 회사 입사일 분포"
)
3.3 desired_amount
# Box plot of the raw desired_amount values
boxplot(user_spec4[["desired_amount"]])
# Summary statistics of desired_amount
summary(user_spec4[["desired_amount"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 5.000e+06 1.000e+07 3.031e+07 3.000e+07 1.000e+10
# ---- desired_amount, excluding the top 5% ----
# The 95th percentile is computed once and reused by every view below.
# Rows equal to the cut-off are kept (<=): the "top 5%" views select with >,
# so a strict < here would leave rows sitting exactly on the cut-off out of
# both analyses.
desired_p95 <- unname(quantile(user_spec4$desired_amount, probs = 0.95))
below_p95 <- user_spec4 %>% filter(desired_amount <= desired_p95)

# Box plot without the top 5%
boxplot(below_p95$desired_amount)

# Overall density without the top 5%
ggplotly(
  below_p95 %>%
    ggplot(mapping = aes(x = desired_amount)) +
    geom_density() +
    theme_minimal() +
    labs(title = "상위 5% 제외 대출 희망 금액 분포")
)

# Interactive density chart of desired_amount, one curve per level of
# `group_var`.
#   data         data frame holding desired_amount and the grouping column
#   group_var    unquoted grouping column, forwarded with {{ }}
#   legend_name  text shown above the colour legend
#   title        plot title
# Returns the plotly widget.
plot_amount_density_by <- function(data, group_var, legend_name, title) {
  p <- ggplot(
    data,
    mapping = aes(
      x = desired_amount,
      group = {{ group_var }}, colour = {{ group_var }}
    )
  ) +
    geom_density() +
    theme_minimal() +
    scale_color_discrete(name = legend_name) +
    labs(title = title)
  ggplotly(p)
}

# By gender
plot_amount_density_by(
  below_p95, gender, "성별", "상위 5% 제외 성별 대출 희망 금액 분포"
)
# By income type
plot_amount_density_by(
  below_p95, income_type, "수입 종류",
  "상위 5% 제외 수입 종류별 대출 희망 금액 분포"
)
# By employment type
plot_amount_density_by(
  below_p95, employment_type, "고용 형태별",
  "상위 5% 제외 고용 형태별 대출 희망 금액 분포"
)
# By home-ownership type
plot_amount_density_by(
  below_p95, houseown_type, "집 종류별",
  "상위 5% 제외 집 종류별 대출 희망 금액 분포"
)
# By loan purpose
plot_amount_density_by(
  below_p95, purpose, "목적별", "상위 5% 제외 목적별 대출 희망 금액 분포"
)
quantile(user_spec4$desired_amount[(user_spec4$gender == 0)])
## 0% 25% 50% 75% 100%
## 0e+00 5e+06 1e+07 3e+07 1e+10
quantile(user_spec4$desired_amount[(user_spec4$gender == 1)])## 0% 25% 50% 75% 100%
## 0e+00 5e+06 1e+07 3e+07 1e+10
# Density of desired_amount for the top 5% only, i.e. rows strictly above the
# 95th percentile of the column.
(
  user_spec4 %>%
    filter(desired_amount > quantile(desired_amount, probs = 0.95)) %>%
    ggplot(aes(x = desired_amount)) +
    geom_density() +
    theme_minimal() +
    ggtitle("상위 5% 대출 희망 금액 분포")
) %>%
  ggplotly()
# Top 5%: desired_amount by gender
# Per-gender row count and quartiles of desired_amount among the rows
# strictly above the 95th percentile.
user_spec4 %>%
  filter(desired_amount > quantile(desired_amount, probs = 0.95)) %>%
  group_by(gender) %>%
  summarise(
    num = n(),
    q1 = quantile(desired_amount, probs = 0.25),
    mid = quantile(desired_amount, probs = 0.5),
    q3 = quantile(desired_amount, probs = 0.75),
    max = quantile(desired_amount, probs = 1)
  )
## # A tibble: 2 × 6
## gender num q1 mid q3 max
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0 14600 100000000 100000000 200000000 10000000000
## 2 1 50603 100000000 100000000 200000000 10000000000
# ---- desired_amount within the top 5%, split by category ----
# The rows strictly above the 95th percentile are selected once and reused,
# instead of recomputing the quantile for each of the five charts.
top_5pct <- user_spec4 %>%
  filter(
    desired_amount > unname(quantile(user_spec4$desired_amount, probs = 0.95))
  )

# Interactive density chart of desired_amount, one curve per level of
# `group_var`.
#   data         data frame holding desired_amount and the grouping column
#   group_var    unquoted grouping column, forwarded with {{ }}
#   legend_name  text shown above the colour legend
#   title        plot title
# Returns the plotly widget.
plot_top5_amount_by <- function(data, group_var, legend_name, title) {
  p <- ggplot(
    data,
    mapping = aes(
      x = desired_amount,
      group = {{ group_var }}, colour = {{ group_var }}
    )
  ) +
    geom_density() +
    theme_minimal() +
    scale_color_discrete(name = legend_name) +
    labs(title = title)
  ggplotly(p)
}

# By gender
plot_top5_amount_by(
  top_5pct, gender, "성별", "상위 5% 성별 대출 희망 금액 분포"
)
# By income type
plot_top5_amount_by(
  top_5pct, income_type, "수입 종류", "상위 5% 수입 종류별 대출 희망 금액 분포"
)
# By employment type
plot_top5_amount_by(
  top_5pct, employment_type, "고용 형태별",
  "상위 5% 고용 형태별 대출 희망 금액 분포"
)
# By home-ownership type
plot_top5_amount_by(
  top_5pct, houseown_type, "집 종류별", "상위 5% 집 종류별 대출 희망 금액 분포"
)
# By loan purpose
plot_top5_amount_by(
  top_5pct, purpose, "목적별", "상위 5% 목적별 대출 희망 금액 분포"
)